Python modules used in this analysis.
if True:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
import datetime
import numpy as np
import math
import reverse_geocoder as rg
from scipy.stats import norm
from scipy.stats import expon
from scipy.stats import lognorm
from scipy.stats import weibull_min
from scipy.stats import weibull_max
from scipy.stats import alpha
from scipy.stats import beta
import scipy.stats as stats
%matplotlib inline
from scipy import stats
from fitter import Fitter
from IPython.display import Image
#plotly.offline.init_notebook_mode(connected=True)
py.init_notebook_mode(connected=False)
#ignore deprication warnings
import warnings
warnings.filterwarnings('ignore')
import operator
import collections
from scipy.optimize import minimize
def objfun(x, *args):
    """Objective function for scipy.optimize.minimize.

    Measures how well a candidate parameter vector makes a distribution's
    CDF match the empirical (plotting-position) probabilities.

    Parameters
    ----------
    x : sequence of float
        Trial distribution parameters, unpacked into ``dist.cdf`` as *params.
    *args : tuple
        args[0] -- a scipy.stats continuous distribution object
        args[1] -- empirical probabilities, sorted ascending
        args[2] -- observed values; the CDF is evaluated at sorted(args[2])

    Returns
    -------
    float
        Sum of squared differences between the empirical probabilities and
        the fitted CDF values (same metric as checkfit()).
    """
    params = x
    dist = args[0]
    data = args[1]
    weights = args[2]
    # Candidate CDF evaluated at each sorted observation.
    fit = [dist.cdf(j, *params) for j in sorted(weights)]
    # Squared-error computation inlined (equivalent to checkfit(data, fit));
    # data and fit are always the same length at every call site.
    return sum((d - f) ** 2 for d, f in zip(data, fit))
def checkfit(data, fit):
    """Return the sum of squared differences between two sequences.

    Parameters
    ----------
    data : sequence of float
        Empirical probabilities (plotting positions).
    fit : sequence of float
        Fitted CDF values; assumed to be the same length as ``data``.

    Returns
    -------
    float
        Sum of (data[i] - fit[i])**2 over all elements.
    """
    # Idiomatic replacement for the original index-based accumulator loop.
    return sum((d - f) ** 2 for d, f in zip(data, fit))
For data reports, see https://www.nohrsc.noaa.gov/nsa/reports.html.
For definitions of terms, see https://www.nohrsc.noaa.gov/help/.
# Read in the Snow Water Equivalent (SWE) data exported from NOHRSC reports.
SWE = pd.read_csv('swe.csv')
if True:  # notebook-cell grouping; always runs
    # remove some unused columns (CSV export artifacts plus Zip_Code)
    SWE = SWE.drop(["Unnamed: 0","Unnamed: 0.1", "Unnamed: 10", "Unnamed: 9","Zip_Code"], axis=1)
    # Add a column that counts the number of entries from each station
    SWE['StationCounts'] = SWE.groupby(['Station_Id'])['Station_Id'].transform('count')
    # Add a column that records the time of first data for each station
    SWE['StartTime'] = SWE.groupby(['Station_Id'])['DateTime_Report(UTC)'].transform('min')
    # Make a copy of the dataframe with only unique stations for plotting on a map.
    SWE_Stations = SWE.drop_duplicates('Station_Id',keep='first')
# Plot every station on an open-street-map scatter (one marker per station).
if True:
    # Alternative kept for reference: color markers by StartTime.
    #fig = px.scatter_mapbox(SWE_Stations, lon="Longitude", lat="Latitude", hover_name="Station_Id", hover_data=["Station_Id","StationCounts"],
    #                        zoom=4, height=300,color='StartTime',color_continuous_scale=px.colors.sequential.Viridis)
    fig = px.scatter_mapbox(SWE_Stations, lon="Longitude", lat="Latitude", hover_name="Station_Id", hover_data=["Station_Id","StartTime"],
                            zoom=4, height=300)
    # open-street-map style needs no Mapbox token.
    fig.update_layout(mapbox_style="open-street-map")
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
    print("Plot of All SWE Data Sites in the Northeast")
    fig.update_layout(showlegend=False)
    fig.show()
# Remove sites with fewer than 'mincount' total data points.
mincount = 100
if True:
    # Keep only reports with a positive SWE amount (drop the zeros).
    SWE = SWE[SWE.Amount>0]
    # Add columns for year, month, day parsed from the report timestamp.
    SWE['year'] = pd.DatetimeIndex(SWE['DateTime_Report(UTC)']).year
    SWE['month'] = pd.DatetimeIndex(SWE['DateTime_Report(UTC)']).month
    SWE['day'] = pd.DatetimeIndex(SWE['DateTime_Report(UTC)']).day
    # Throw away the stations with fewer than 'mincount' data points.
    # NOTE(review): StationCounts was computed BEFORE the zero-amount rows
    # were dropped, so the threshold applies to raw counts — confirm intended.
    SWE = SWE[SWE['StationCounts']>mincount]
    # Refresh the unique-station view after filtering.
    SWE_Stations = SWE.drop_duplicates('Station_Id',keep='first')
# Drop all stations outside of MA using a rough lat/lon bounding box.
if True:
    SWE_Stations = SWE_Stations[SWE_Stations['Latitude']<42.8]
    SWE_Stations = SWE_Stations[SWE_Stations['Latitude']>41.9]
    SWE_Stations = SWE_Stations[SWE_Stations['Longitude']>-73]
if False:  # disabled: slow reverse-geocoding refinement of the state filter
    #THIS TAKES A WHILE, ONLY USED FOR SHOWING THE MAP PLOT OF STATIONS
    # Uses reverse geocoding to keep only stations whose admin1 region is
    # Massachusetts; not really necessary here, the coordinate box suffices.
    rows_to_delete=[]
    for i, row in enumerate(SWE_Stations.itertuples(), 1):
        #print(i,row.Index)
        coords = (SWE_Stations.iloc[i-1]['Latitude'],SWE_Stations.iloc[i-1]['Longitude'])
        results = rg.search(coords)
        if results[0]['admin1'] != "Massachusetts":
            rows_to_delete.append(row.Index)
    SWE_Stations = SWE_Stations.drop(rows_to_delete)
# Add columns for the initial date of each station:
# StartDate as a datetime, epoch as its integer (nanosecond) representation
# so it can drive a continuous colorbar below.
SWE_Stations['StartDate'] = pd.to_datetime(SWE_Stations['StartTime'])
SWE_Stations['epoch'] = pd.to_numeric(SWE_Stations['StartDate'])
# Plot the filtered MA stations: marker size = number of data points,
# marker color = station start date (epoch), with a year-labeled colorbar.
if True:
    # NOTE(review): hard-coded Mapbox access token committed in source —
    # should be rotated and loaded from an environment variable instead.
    mapbox_access_token = 'pk.eyJ1IjoidGZnMjUwIiwiYSI6ImNrM2FoOXJvMjBjaHEzZHJ0MjdoczNxczUifQ.NdBJ3-eSMYknzg_vfPm0Qg'
    # Build 13 evenly spaced colorbar ticks spanning the epoch range,
    # labeled with the years 2007-2019.
    length = max(SWE_Stations['epoch']) - min(SWE_Stations['epoch'])
    tickvals = [min(SWE_Stations['epoch']) + (length/12)*i for i in range(0,13)]
    ticktext = list(range(2007,2020))
    # Custom hover text: station id and its data-point count.
    txt = ["Station_Id: "+str(i)+"<br>" +"StationCounts: "+ str(j) for i, j in zip(list(SWE_Stations['Station_Id']), list(SWE_Stations['StationCounts']))]
    fig = go.Figure()
    fig.add_trace(go.Scattermapbox(text = txt,hoverinfo = 'text',lon=SWE_Stations["Longitude"],
        lat=SWE_Stations["Latitude"],mode='markers',marker=go.scattermapbox.Marker(opacity=1,
        size=SWE_Stations['StationCounts']/40.,color=SWE_Stations['epoch'],colorscale='viridis',
        colorbar={'title':'Start Date','tickmode':'array','tickvals':tickvals,'ticktext':ticktext})))
    fig.update_layout(mapbox_style="open-street-map")
    # Center the view on central Massachusetts.
    fig.update_layout(hovermode='closest',
        mapbox=go.layout.Mapbox(
            accesstoken=mapbox_access_token,
            bearing=0,
            center=go.layout.mapbox.Center(
                lat=42.4,
                lon=-72),
            pitch=0,
            zoom=7))
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
    print("Plot of sites in MA with at least " + str(mincount)+" Data Points")
    print("color is the start date, marker size is the number of data points")
    #fig.update_layout(showlegend=False)
    fig.show()
# GET MONTHLY SWE MAXIMUMS (for each station in SELECTED_STATIONS).
# The entire row that governs (holds the max) is carried forward.
SELECTED_STATIONS = ['GRFM3']
#SELECTED_STATIONS = ['ABNM3']
#SELECTED_STATIONS = ['LWMM3']
#SELECTED_STATIONS = ['MILM3']
#SELECTED_STATIONS = ['HRKM3']
#SELECTED_STATIONS = ['ASHM3']
#SELECTED_STATIONS = ['NRNM3']
#SELECTED_STATIONS = ['GRFM3','ABNM3','LWMM3','MILM3','HRKM3','ASHM3','NRNM3']
#SELECTED_STATIONS = list(SWE_Stations['Station_Id'])
if True:
    MONTHLY_DATA=pd.DataFrame()
    ANNUAL_DATA=pd.DataFrame()
    #for all stations?
    #for station in set(SWE_Stations['Station_Id']):
    #select the station with the most data
    #for station in SELECTED_STATIONS:
    if True:
        for yr in range(2006,2020):
            # All reports for this year from the selected station(s).
            tmp1 = SWE[(SWE['year']==yr) & (SWE.Station_Id.isin(SELECTED_STATIONS))]
            try:
                # Keep the single row holding the annual maximum Amount.
                # pd.concat replaces DataFrame.append (removed in pandas 2.0);
                # the double-bracket .loc keeps the result a one-row DataFrame.
                ANNUAL_DATA = pd.concat([ANNUAL_DATA, tmp1.loc[[tmp1['Amount'].idxmax()]]])
            except Exception:
                # Best-effort: years with no data raise (e.g. ValueError from
                # idxmax on an empty frame); skip them, as before.
                pass
            for month in range(1,13):
                tmp2 = SWE[(SWE['year']==yr) & (SWE['month']==month) & (SWE.Station_Id.isin(SELECTED_STATIONS))]
                try:
                    # Carry the governing annual max alongside each monthly max.
                    tmp2['AnnualData'] = max(max(tmp1["Amount"]),max(tmp2['Amount']))
                    MONTHLY_DATA = pd.concat([MONTHLY_DATA, tmp2.loc[[tmp2['Amount'].idxmax()]]])
                except Exception:
                    # Months with no data raise (max()/idxmax on empty); skip.
                    pass
    #limit monthly data to full years
    #MONTHLY_DATA = MONTHLY_DATA[(MONTHLY_DATA['year']>2007)]
    # Drop some columns no longer needed downstream.
    ANNUAL_DATA = ANNUAL_DATA.drop(["StationCounts","Latitude", "Longitude"], axis=1)
    MONTHLY_DATA = MONTHLY_DATA.drop(["StationCounts","Latitude", "Longitude"], axis=1)
#ANNUAL_DATA
# For the purposes of plotting, insert zero-Amount rows for months with no
# data so the step plot drops to zero instead of interpolating across gaps.
if True:
    for yr in range(2007,2020):
        try:
            tmp1 = ANNUAL_DATA[(ANNUAL_DATA['year']==yr)]
            #print(tmp1,"\n\n")
            for month in range(1,13):
                tmp = MONTHLY_DATA[(MONTHLY_DATA['year']==yr) & (MONTHLY_DATA['month']==month)]
                #print(tmp.shape[0])
                if tmp.shape[0]==0:
                    # Synthesize a zero row dated the 1st at hour 00
                    # (timestamp format matches the data, e.g. '2006-11-25 16'),
                    # carrying this year's annual max in AnnualData.
                    dfadd = pd.DataFrame([[yr,month,15,0,str(yr)+"-"+"%02i" %month+"-01 00",float(list(tmp1['Amount'])[0])]], columns = ['year','month','day','Amount','DateTime_Report(UTC)','AnnualData'])
                    # pd.concat replaces DataFrame.append (removed in pandas 2.0).
                    MONTHLY_DATA = pd.concat([MONTHLY_DATA, dfadd])
        except Exception:
            # Best-effort: years absent from ANNUAL_DATA raise (IndexError on
            # the empty Amount list); skip them, as before.
            pass
    # Keep the series in chronological order for plotting.
    MONTHLY_DATA = MONTHLY_DATA.sort_values(by=['DateTime_Report(UTC)'])
# Plot the monthly and annual maxima over time as step ('hvh') lines.
if True:
    fig = go.Figure()
    # go.Scatter with mode='lines' replaces the deprecated go.Line alias;
    # the rendered traces are identical.
    fig.add_trace(go.Scatter(x=MONTHLY_DATA['DateTime_Report(UTC)'], y=MONTHLY_DATA['Amount'],
        mode='lines',marker=dict(color='red',size=8),line_shape='hvh',
        name='Monthly Maximum Data'))
    fig.add_trace(go.Scatter(x=MONTHLY_DATA['DateTime_Report(UTC)'], y=MONTHLY_DATA['AnnualData'],
        mode='lines',marker=dict(color='black',size=8),line_shape='hvh',
        name='Annual Maximum Data'))
    fig.update_layout(
        title="Annual & Monthly Maximum SWE Data",
        xaxis_title="Date",
        yaxis_title="SWE (inches of water)",
        template='plotly_white')
    fig.show()
# Remove the zeros (the synthetic plotting rows added above).
if True:
    ANNUAL_DATA = ANNUAL_DATA[ANNUAL_DATA.Amount>0]
    MONTHLY_DATA = MONTHLY_DATA[MONTHLY_DATA.Amount>0]
# Convert annual-max SWE depth (inches of water) to roof load (psf).
if True:
    swe_vals_annual = sorted(list(ANNUAL_DATA["Amount"]))
    h20 = 5.202288 #psf per inch of water depth (unit weight of water / 12 in)
    weight_vals_annual = [i*h20 for i in swe_vals_annual]
    n=len(weight_vals_annual)
# Histogram-based fit of many candidate distributions via the fitter package;
# 'f' (fitted params and errors) is reused throughout the rest of the notebook.
if True:
    data_array = np.asarray(weight_vals_annual)
    f = Fitter(data_array,verbose=False)
    f.fit()
    f.summary()
# Number of best-fitting distributions to show by default in the plots.
showbest = 5
if True:
    # Parallel lists: scipy.stats distribution objects and their names,
    # matching the set fitter tries.
    DISTRIBUTIONS = [stats.alpha,stats.anglit,stats.arcsine,stats.argus,stats.beta,stats.betaprime,stats.bradford,stats.burr,stats.burr12,stats.cauchy,stats.chi,stats.chi2,stats.cosine,stats.crystalball,stats.dgamma,stats.dweibull,stats.erlang,stats.expon,stats.exponnorm,stats.exponpow,stats.exponweib,stats.f,stats.fatiguelife,stats.fisk,stats.foldcauchy,stats.foldnorm,stats.frechet_l,stats.frechet_r,stats.gamma,stats.gausshyper,stats.genexpon,stats.genextreme,stats.gengamma,stats.genhalflogistic,stats.genlogistic,stats.gennorm,stats.genpareto,stats.gilbrat,stats.gompertz,stats.gumbel_l,stats.gumbel_r,stats.halfcauchy,stats.halfgennorm,stats.halflogistic,stats.halfnorm,stats.hypsecant,stats.invgamma,stats.invgauss,stats.invweibull,stats.johnsonsb,stats.johnsonsu,stats.kappa3,stats.kappa4,stats.ksone,stats.kstwobign,stats.laplace,stats.levy,stats.levy_l,stats.levy_stable,stats.loggamma,stats.logistic,stats.loglaplace,stats.lognorm,stats.lomax,stats.maxwell,stats.mielke,stats.moyal,stats.nakagami,stats.ncf,stats.nct,stats.ncx2,stats.norm,stats.norminvgauss,stats.pareto,stats.pearson3,stats.powerlaw,stats.powerlognorm,stats.powernorm,stats.rayleigh,stats.rdist,stats.recipinvgauss,stats.reciprocal,stats.rice,stats.rv_continuous,stats.rv_histogram,stats.semicircular,stats.skewnorm,stats.t,stats.trapz,stats.triang,stats.truncexpon,stats.truncnorm,stats.tukeylambda,stats.uniform,stats.vonmises,stats.vonmises_line,stats.wald,stats.weibull_max,stats.weibull_min,stats.wrapcauchy]
    DIST_NAMES = ['alpha','anglit','arcsine','argus','beta','betaprime','bradford','burr','burr12','cauchy','chi','chi2','cosine','crystalball','dgamma','dweibull','erlang','expon','exponnorm','exponpow','exponweib','f','fatiguelife','fisk','foldcauchy','foldnorm','frechet_l','frechet_r','gamma','gausshyper','genexpon','genextreme','gengamma','genhalflogistic','genlogistic','gennorm','genpareto','gilbrat','gompertz','gumbel_l','gumbel_r','halfcauchy','halfgennorm','halflogistic','halfnorm','hypsecant','invgamma','invgauss','invweibull','johnsonsb','johnsonsu','kappa3','kappa4','ksone','kstwobign','laplace','levy','levy_l','levy_stable','loggamma','logistic','loglaplace','lognorm','lomax','maxwell','mielke','moyal','nakagami','ncf','nct','ncx2','norm','norminvgauss','pareto','pearson3','powerlaw','powerlognorm','powernorm','rayleigh','rdist','recipinvgauss','reciprocal','rice','rv_continuous','rv_histogram','semicircular','skewnorm','t','trapz','triang','truncexpon','truncnorm','tukeylambda','uniform','vonmises','vonmises_line','wald','weibull_max','weibull_min','wrapcauchy']
    # Distribution names to exclude from consideration (none by default).
    skips = []
    # Collect fitter's histogram-based sum-of-squares error per distribution.
    fitter_errors={}
    min_error=9e99
    for i in range(len(DIST_NAMES)):
        #print(DIST_NAMES[i])
        if DIST_NAMES[i] not in skips:
            fitter_errors[DIST_NAMES[i]] = f.df_errors['sumsquare_error'][DIST_NAMES[i]]
            min_error = min(min_error,f.df_errors['sumsquare_error'][DIST_NAMES[i]])
    # Threshold = the (showbest+1)-th smallest error; fits at or below it
    # are drawn visible, the rest are legend-only.
    ErrorThreshold = np.sort(list(fitter_errors.values()))[showbest]
    print('ErrorThreshold=',"%.4f" %ErrorThreshold)
    fig = go.Figure()
    n = len(weight_vals_annual)
    # Empirical plotting positions: P_i = (i+1)/(n+1) (Weibull positions).
    Pvals_annual = []
    for i in range(len(sorted(weight_vals_annual))):
        Pvals_annual.append((i+1)/(n+1))
    for i in range(len(DIST_NAMES)):
        #print(DIST_NAMES[i])
        if DIST_NAMES[i] not in skips:
            error = f.df_errors['sumsquare_error'][DIST_NAMES[i]]
            if np.isnan(error) or error>ErrorThreshold:
                visible='legendonly'
            else:
                visible=True
            try:
                name = DIST_NAMES[i]
                dist = DISTRIBUTIONS[i]
                params = f.fitted_param[name]
                # NOTE(review): the chained 'x=' assignment is vestigial;
                # only yy is used below.
                yy = x=np.linspace(0.001,0.999,98)
                # Draw the fitted CDF as (quantile, probability) pairs.
                xx = [dist.ppf(i, *params) for i in yy]
                fig.add_trace(go.Scatter(x=xx, y=yy, line=dict(width=1),hoverinfo='name+text',
                    mode='lines',hovertext="%1.3g" % error,visible=visible,name=name+" "+"%1.3g" % error))
            except KeyError:
                # Distribution was not fitted by fitter; skip it.
                pass
    # Overlay the empirical data points.
    fig.add_trace(go.Scatter(x=sorted(weight_vals_annual), y=Pvals_annual,
        mode='markers',marker=dict(color='black',size=8),
        name='data'))
    fig.update_layout(
        title="Fitting Distributions to Annual Maximum Data Using Python FITTER Module",
        xaxis_title="Annual Maximum Measured Snow Weight (psf)",
        yaxis_title="Probability",
        template='plotly_white')
    fig.update_xaxes(range=[0, 1.5*max(weight_vals_annual)])
    fig.show()
# Use the best histogram-fitted distributions to estimate the 50-year MRI event.
if True:
    best_fits=[]
    for i in range(len(DIST_NAMES)):
        error = f.df_errors['sumsquare_error'][DIST_NAMES[i]]
        # BUGFIX: the original condition was `np.isnan(error) or
        # error<=ErrorThreshold`, which INCLUDED failed (NaN-error) fits in
        # best_fits and could KeyError on f.fitted_param for unfitted
        # distributions. NaN fits must be excluded.
        if not np.isnan(error) and error<=ErrorThreshold:
            name = DIST_NAMES[i]
            dist = DISTRIBUTIONS[i]
            params = f.fitted_param[name]
            best_fits.append(name)
    # Non-exceedance probability for a 50-year mean recurrence interval:
    # 0.98 = 1 - 1/50 (annual maxima, one observation per year).
    yy = 0.98
    #print(months_per_year,yy,"\n\n")
    # BUGFIX: header previously said "Monthly Maximums" but this section
    # evaluates the ANNUAL-maximum fits (cf. the monthly section below).
    print("\t\t50yr MRI Snow Load from Annual Maximums")
    for fit in best_fits:
        i = DIST_NAMES.index(fit)
        dist = DISTRIBUTIONS[i]
        params = f.fitted_param[fit]
        # 50-year load = the 0.98 quantile of the fitted distribution.
        xx = dist.ppf(0.98, *params)
        if not np.isnan(xx) and not np.isinf(xx):
            print("%20s"%fit,"\t","%.2f" %xx,"psf")
These distributions don't appear to fit very well, because fitter's fit is based on the histogram (PDF) and there isn't enough data here to build a decent histogram. So instead, solve for optimized parameters using a custom objective function that compares CDF values.
Set up an optimization objective based on the error between the data and the distribution in terms of the CDF, then run scipy.optimize to find the best-fitting distribution parameters.
#THIS TAKES A WHILE...
# Re-fit every distribution by minimizing objfun (squared error between the
# distribution CDF and the empirical plotting positions), starting from
# fitter's histogram-based parameters.
if True:
    ERROR={}
    NEW_PARAMS={}
    for i in range(len(DIST_NAMES)):
        # Only distributions fitter successfully fitted have start params.
        if DIST_NAMES[i] in f.fitted_param:
            name = DIST_NAMES[i]
            dist = DISTRIBUTIONS[i]
            params = f.fitted_param[name]
            sol = minimize(objfun, params, args=(dist,Pvals_annual,weight_vals_annual))
            error = sol.fun
            newparams = sol.x
            ERROR[name] = error
            NEW_PARAMS[name] = newparams
    #except:
    #    pass
    # Order distributions by ascending CDF fit error.
    ERROR = sorted(ERROR.items(), key=operator.itemgetter(1))
    ERROR = collections.OrderedDict(ERROR)
# Plot the CDF-optimized fits against the empirical annual data.
showbest=5
# Visibility for fits above the threshold: legend-only (click to show).
showall='legendonly'
if True:
    #ErrorThreshold=1.25*min(ERROR.values())
    # Threshold = the (showbest+1)-th smallest optimized error.
    ErrorThreshold = np.sort(list(ERROR.values()))[showbest]
    print('ErrorThreshold=',"%.4f" %ErrorThreshold)
    fig = go.Figure()
    skips = []
    for i in range(len(DIST_NAMES)):
        #try:
        if True:
            name = DIST_NAMES[i]
            if name not in skips and name in ERROR:
                error = ERROR[name]
                if np.isnan(error) or error>ErrorThreshold:
                    #visible='legendonly'
                    visible=showall
                else:
                    visible=True
                dist = DISTRIBUTIONS[i]
                params = NEW_PARAMS[name]
                # NOTE(review): the chained 'x=' assignment is vestigial;
                # only yy is used below.
                yy = x=np.linspace(0.001,0.999,98)
                # Fitted CDF drawn as (quantile, probability) pairs.
                xx = [dist.ppf(i, *params) for i in yy]
                fig.add_trace(go.Scatter(x=xx, y=yy, line=dict(width=1),hoverinfo='name+text',
                    mode='lines',hovertext="%.4f" % error,visible=visible,name=name+" "+"%1.3g" % error))
    # Overlay the empirical data points for the selected station.
    fig.add_trace(go.Scatter(x=sorted(weight_vals_annual), y=Pvals_annual,
        mode='markers',marker=dict(color='black',size=8),
        name='data from station: '+SELECTED_STATIONS[0]))
    fig.update_layout(
        title="Fitting Distributions to Annual Maximum Data with Optimization on CDF",
        xaxis_title="Annual Maximum Measured Snow Weight (psf)",
        yaxis_title="Probability",
        template='plotly_white')
    fig.update_xaxes(range=[0, 1.5*max(weight_vals_annual)])
    fig.show()
# Use the best CDF-optimized distributions to estimate the 50yr MRI event.
if True:
    best_fits=[]
    for i in range(len(DIST_NAMES)):
        try:
            if DIST_NAMES[i] not in skips:
                #error = f.df_errors['sumsquare_error'][DIST_NAMES[i]]
                error = ERROR[DIST_NAMES[i]]
                #print()
                # Keep only fits strictly better than the threshold.
                if not np.isnan(error) and error<ErrorThreshold:
                    #print(DIST_NAMES[i],error)
                    best_fits.append(DIST_NAMES[i])
        except KeyError:
            # Distribution absent from ERROR (never optimized); skip.
            pass
    #print(best_fits)
    print("50yr MRI Snow Load from Annual Maximums")
    for fit in best_fits:
        i = DIST_NAMES.index(fit)
        dist = DISTRIBUTIONS[i]
        params = NEW_PARAMS[fit]
        # 0.98 = 1 - 1/50: non-exceedance probability for a 50-year MRI
        # with one (annual) observation per year.
        yy = 0.98
        xx = dist.ppf(yy, *params)
        print("%20s"%fit,"\t","%.2f" %xx,"psf")
# Repeat the CDF-optimization analysis with the monthly maxima.
if True:
    swe_vals_monthly = sorted(list(MONTHLY_DATA["Amount"]))
    h20 = 5.202288 #psf per inch of water depth
    weight_vals_monthly = [i*h20 for i in swe_vals_monthly]
    n=len(weight_vals_monthly)
# Compute optimized fit parameters for the monthly data.
if True:
    Pvals_monthly = []
    X = []
    n = len(weight_vals_monthly)
    for i in range(len(sorted(weight_vals_monthly))):
        # BUGFIX: was i/(n+1), inconsistent with the annual plotting
        # position (i+1)/(n+1) and assigning P=0 to the smallest point.
        Pvals_monthly.append((i+1)/(n+1))
        X.append(weight_vals_monthly[i])
    ERROR_monthly={}
    NEW_PARAMS_monthly={}
    for i in range(len(DIST_NAMES)):
        try:
            name = DIST_NAMES[i]
            dist = DISTRIBUTIONS[i]
            params = f.fitted_param[name]
            sol = minimize(objfun, params, args=(dist,Pvals_monthly,weight_vals_monthly))
            error = sol.fun
            newparams = sol.x
            ERROR_monthly[name] = error
            NEW_PARAMS_monthly[name] = newparams
        except Exception:
            # Best-effort (was a bare except): unfitted distributions raise
            # KeyError, and some optimizations fail outright; skip them.
            pass
    # Order distributions by ascending CDF fit error.
    ERROR_monthly = sorted(ERROR_monthly.items(), key=operator.itemgetter(1))
    ERROR_monthly = collections.OrderedDict(ERROR_monthly)
# Plot the best-fit distributions against the monthly-maximum data.
if True:
    # Threshold = the (showbest+1)-th smallest optimized error.
    ErrorThreshold = np.sort(list(ERROR_monthly.values()))[showbest]
    print('ErrorThreshold=',"%.4f" %ErrorThreshold)
    fig = go.Figure()
    skips = []
    for i in range(len(DIST_NAMES)):
        name = DIST_NAMES[i]
        if name not in skips and name in ERROR_monthly:
            error = ERROR_monthly[name]
            if np.isnan(error) or error>ErrorThreshold:
                #visible='legendonly'
                visible = showall
            else:
                visible=True
            dist = DISTRIBUTIONS[i]
            params = NEW_PARAMS_monthly[name]
            yy = np.linspace(0.001,0.999,98)
            # Fitted CDF drawn as (quantile, probability) pairs.
            xx = [dist.ppf(i, *params) for i in yy]
            fig.add_trace(go.Scatter(x=xx, y=yy, line=dict(width=1),hoverinfo='name+text',
                mode='lines',hovertext="%.4f" % error,visible=visible,name=name+" "+"%1.3g" % error))
    # BUGFIX: the legend label was hard-coded to 'GRFM3'; use the selected
    # station list (as the annual plot does) so it stays correct when
    # SELECTED_STATIONS changes.
    fig.add_trace(go.Scatter(x=sorted(weight_vals_monthly), y=Pvals_monthly,
        mode='markers',marker=dict(color='black',size=8),
        name='data from station: '+SELECTED_STATIONS[0]))
    fig.update_layout(
        title="Fitting Distributions to Monthly Maximum Data with Optimization on CDF",
        xaxis_title="Monthly Maximum Measured Snow Weight (psf)",
        yaxis_title="Probability",
        template='plotly_white')
    # NOTE(review): x-range keyed to the annual maxima; each annual max is
    # also a monthly max, so the bound is the same value — confirm intended.
    fig.update_xaxes(range=[0, 1.5*max(weight_vals_annual)])
    fig.show()
# Use the best-fit monthly distributions to estimate the 50yr MRI event.
if True:
    best_fits=[]
    for i in range(len(DIST_NAMES)):
        name = DIST_NAMES[i]
        if DIST_NAMES[i] not in skips and name in ERROR_monthly:
            error = ERROR_monthly[name]
            if not np.isnan(error) and error<ErrorThreshold:
                best_fits.append(DIST_NAMES[i])
    # Non-exceedance probability for a 50-year MRI, scaled to the number of
    # observations per year:
    #   annual:                            0.98   = 1 - 1/50
    #   monthly (12 months per year):      0.9983 = 1 - 1/(12*50)
    #   monthly (2.4 snow months/year):    0.9917 = 1 - 1/(2.4*50)
    # Effective observations per year actually present in the record:
    months_per_year = MONTHLY_DATA.shape[0]/(MONTHLY_DATA['year'].max()-MONTHLY_DATA['year'].min()+1)
    yy = 1 - (1/(months_per_year*50))
    #print(months_per_year,yy,"\n\n")
    print("\t\t50yr MRI Snow Load from Monthly Maximums")
    for fit in best_fits:
        i = DIST_NAMES.index(fit)
        dist = DISTRIBUTIONS[i]
        params = NEW_PARAMS_monthly[fit]
        # 50-year load = the yy quantile of the fitted distribution.
        xx = dist.ppf(yy, *params)
        print("%20s"%fit,"\t","%.2f" %xx,"psf")